{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import requests\n",
    "import re\n",
    "import bs4\n",
    "from bs4 import BeautifulSoup\n",
    "import urllib.request\n",
    " \n",
    "#获取一个中英文混合的字符串文本的字符宽度部分\n",
    "widths = [\n",
    "  (126,  1), (159,  0), (687,   1), (710,  0), (711,  1),\n",
    "  (727,  0), (733,  1), (879,   0), (1154, 1), (1161, 0),\n",
    "  (4347,  1), (4447,  2), (7467,  1), (7521, 0), (8369, 1),\n",
    "  (8426,  0), (9000,  1), (9002,  2), (11021, 1), (12350, 2),\n",
    "  (12351, 1), (12438, 2), (12442,  0), (19893, 2), (19967, 1),\n",
    "  (55203, 2), (63743, 1), (64106,  2), (65039, 1), (65059, 0),\n",
    "  (65131, 2), (65279, 1), (65376,  2), (65500, 1), (65510, 2),\n",
    "  (120831, 1), (262141, 2), (1114109, 1),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_width(a):\n",
    "  global widths\n",
    "  if a == 0xe or a == 0xf:\n",
    "    return 0\n",
    "  for num, wid in widths:\n",
    "    if a <= num:\n",
    "      return wid\n",
    "  return 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def length(str):\n",
    "    sum = 0\n",
    "    for ch in str:\n",
    "        sum += get_width(ord(ch))\n",
    "    return sum"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#获取HTML文本\n",
    "def getHTMLText(url):\n",
    "    try:\n",
    "        #模拟浏览器\n",
    "        kv = {'user-agent':'Mozilla/5.0'}\n",
    "        r = requests.get(url, headers=kv, timeout=30)  \n",
    "        r.raise_for_status()\n",
    "        r.encoding = r.apparent_encoding\n",
    "        return r.text\n",
    "    except:\n",
    "        print(\"InternetError!\")\n",
    "        return \" \"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "#解析并且返回HTML文本\n",
    "def HTMLTextconvert(html):\n",
    "    try:\n",
    "        soup = bs4.BeautifulSoup(html, \"html.parser\")\n",
    "        return soup\n",
    "    except:\n",
    "        print(\"HTMLConvertError!\")\n",
    "        return \" \"\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def contents(links): \n",
    "     # Retrieve first link\n",
    "    cols = []\n",
    "    \n",
    "    link = 'https://s.weibo.com'+links[0]\n",
    "    \n",
    "    url = urllib.request.urlopen(link).read()\n",
    "    soup = bs4.BeautifulSoup(url,'lxml')\n",
    "\n",
    "    all_contents = soup.findAll(\"div\", {\"class\": \"content\"})\n",
    "    \n",
    "    \n",
    "    for content in all_contents:\n",
    "        if content.find(\"p\", attrs={'node-type': 'feed_list_content_full', 'class': 'txt'}):\n",
    "            expended_content = content.find(\"p\", attrs={'node-type': 'feed_list_content_full', 'class': 'txt'}).get_text().replace(\"收起全文d\", \"\")\n",
    "            cols.append(expended_content)\n",
    "        else:\n",
    "            cols.append(content.find(\"p\", {\"class\": \"txt\"}).get_text())\n",
    "    \n",
    "    \n",
    "    return cols\n",
    "\n",
    "    for c in cols:\n",
    "        print(c)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "#检索HTML中的信息，获取搜索排名信息\n",
    "#存在置顶的情况，需要特殊判断\n",
    "def HTMLSearch(html, ranklist):\n",
    "    links = []\n",
    "    try:\n",
    "         flag = 0\n",
    "         #找到所有tbody标签下的所有内容，并且遍历所有的儿子节点\n",
    "         for tr in html.find(\"tbody\").children:\n",
    "            #添加判断：获得的内容是否为标签Tag类型\n",
    "            if isinstance(tr, bs4.element.Tag):\n",
    "                #使用flag特判置顶的情况\n",
    "                if flag==0:\n",
    "                    rank = \"置顶\"\n",
    "                    #注意由于class属性会和python中的关键字重名，因此需要变为class_\n",
    "                    td02 = tr.find_all(class_=re.compile('td-02'))\n",
    "                    for i in td02:\n",
    "                        if isinstance(i, bs4.element.Tag):\n",
    "                            #trans得到的类型为列表\n",
    "                            trans = i.find_all(\"a\")\n",
    "                    number = \" \"\n",
    "                    ranklist.append([rank, trans[0].string, number])        \n",
    "                    flag = 1\n",
    "                else:\n",
    "                    #排名信息在td标签下的class=td-01属性中\n",
    "                    td01 = tr.find_all(class_=re.compile(\"td-01\"))\n",
    "                    for i in td01:\n",
    "                        if isinstance(i, bs4.element.Tag):\n",
    "                            rank = i.string\n",
    "                    #热搜内容和搜索量在td标签下的class=td-02属性中：内容是a标签，搜索量是span标签\n",
    "\n",
    "                    td02 = tr.find_all(class_=re.compile(\"td-02\"))\n",
    "                    for i in td02:\n",
    "                        name = i.find('a', href=True)['href']\n",
    "                        links.append(name)\n",
    "    except:\n",
    "        print(\"HTMLSearchError!\")\n",
    "\n",
    "    contents(links)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#打印排名\n",
    "def Rankprint(ranklist, num):\n",
    "    try:\n",
    "        #先打印表头,总长为70个字符，其中{1}和{3}是变化的空格数量计算，默认为：\n",
    "        #排名*4，空格*3，名称*50，空格*5，点击量*8\n",
    "        a = \" \"\n",
    "        print(\"——————————————————————————————————————\")\n",
    "        print(\"{0}{1}{2}{3}{4}\\n\".format(\"排名\", a*5, \"热搜内容\", a*45, \"搜索量\"+a*2))\n",
    "        #使用flag来判断是否输出了置顶内容\n",
    "        flag = 0\n",
    "        for i in range(num):\n",
    "            if flag==0:\n",
    "#                 print(\"{0}{1}{2}\\n\".format(ranklist[i][0], a*3, ranklist[i][1]))\n",
    "                flag = 1\n",
    "            else:\n",
    "                #c是排名有一位、两位的数字，用来纠正空格\n",
    "                c = 7-len(ranklist[i][0])\n",
    "                #根据内容来随时计算所要填充的空格数量b\n",
    "                str = ranklist[i][1]\n",
    "                b = 62-length(ranklist[i][1])-len(ranklist[i][0])-c\n",
    "                print(\"{0}{1}{2}{3}{4:<8}\".format(ranklist[i][0], a*c, ranklist[i][1], a*b, ranklist[i][2]))\n",
    "        print(\"\\n\")\n",
    "    except:\n",
    "        print(\"RankPrintError!\")\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#主函数\n",
    "def main():\n",
    "    try:\n",
    "        #微博热搜的网站\n",
    "        url = \"https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6\"\n",
    "        #使用二维列表存储每一条热搜信息的rank信息和内容\n",
    "        rank = []\n",
    "        text = getHTMLText(url)\n",
    "        soup = HTMLTextconvert(text)\n",
    "        HTMLSearch(soup, rank)\n",
    "        #contents(links)\n",
    "        #Rankprint(rank, 51)\n",
    "    except:\n",
    "        print(\"SystemError!\")\n",
    "        return 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute Program\n",
    "main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
